//  link_check implementation  -----------------------------------------------//

//  Copyright Beman Dawes 2002.
//
//  SPDX-License-Identifier: BSL-1.0
//  Distributed under the Boost Software License, Version 1.0.
//  (See accompanying file LICENSE_1_0.txt or copy at
//  http://www.boost.org/LICENSE_1_0.txt)

#include <hpx/config.hpp>
#include <hpx/modules/filesystem.hpp>
#include <hpx/modules/string_util.hpp>

#include <algorithm>
#include <cctype>
#include <cstdlib>
#include <set>
#include <string>
#include "boost/regex.hpp"
#include "function_hyper.hpp"
#include "link_check.hpp"

// #include <iostream>

namespace fs = hpx::filesystem;

namespace {
    // Matches either a tag carrying a NAME or ID attribute, or an HTML comment
    // (case-insensitive). Capture groups for the tag alternative:
    //   1 = tag name, 2 = attribute name (NAME|ID),
    //   3 = quote character, 4 = attribute value (the bookmark).
    // When the comment alternative matches, none of the groups participate,
    // so callers can step over commented-out markup by testing group 4.
    boost::regex html_bookmark_regex(
        "<([^\\s<>]*)\\s*[^<>]*\\s+(NAME|ID)\\s*=\\s*(['\"])(.*?)\\3"
        "|<!--.*?-->",
        boost::regbase::normal | boost::regbase::icase);
    // Matches either a tag carrying an HREF or SRC attribute, or an HTML
    // comment (case-insensitive). Capture groups for the tag alternative:
    //   1 = tag name, 2 = quote character,
    //   3 = URL with surrounding whitespace excluded.
    boost::regex html_url_regex(
        "<([^\\s<>]*)\\s*[^<>]*\\s+(?:HREF|SRC)"    // HREF or SRC
        "\\s*=\\s*(['\"])\\s*(.*?)\\s*\\2"
        "|<!--.*?-->",
        boost::regbase::normal | boost::regbase::icase);
    // Matches either a CSS '@import "..."' / 'url(...)' reference, or a CSS
    // comment '/* ... */' (case-insensitive). Capture groups for the reference
    // alternative: 1 = the introducer, 2 = the URL. There is no group 3.
    boost::regex css_url_regex(
        "(\\@import\\s*[\"']|url\\s*\\(\\s*[\"']?)([^\"')]*)"
        "|/\\*.*?\\*/",
        boost::regbase::normal | boost::regbase::icase);

    // Regular expression for parsing URLs, taken from:
    // http://tools.ietf.org/html/rfc3986#appendix-B
    // Capture groups: 2 = scheme, 4 = authority, 5 = path, 7 = query,
    // 9 = fragment.
    boost::regex url_decompose_regex(
        "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?$",
        boost::regbase::normal);

    // Bookmarks (NAME/ID values) of the file currently being inspected. Both
    // sets are cleared and rebuilt at the start of every content inspection.
    typedef std::set<std::string> bookmark_set;
    bookmark_set bookmarks;    // exact spelling, used to resolve '#fragment'
    bookmark_set
        bookmarks_lowercase;    // duplicate check needs case insensitive

    // Decode HTML-escaped ampersands: every "&amp;" in 'url_path' is replaced
    // by a single '&'. Any other '&' is copied through unchanged.
    //
    // The result is empty only when the input is empty; this function has no
    // separate failure mode.
    std::string decode_ampersands(std::string const& url_path)
    {
        std::string::size_type pos = 0, next;
        std::string result;
        result.reserve(url_path.length());

        // Walk over *all* ampersands. (Stopping after the first one would
        // leave later "&amp;" sequences undecoded.)
        while ((next = url_path.find('&', pos)) != std::string::npos)
        {
            // copy the text preceding the ampersand, then the ampersand itself
            result.append(url_path, pos, next - pos);
            result += '&';

            // skip the whole entity if this is "&amp;", otherwise just the '&'
            bool const is_entity = url_path.compare(next, 5, "&amp;") == 0;
            pos = next + (is_entity ? 5 : 1);
        }

        // copy whatever follows the last ampersand
        result.append(url_path, pos, std::string::npos);

        return result;
    }

    // Decode percent-encoded characters ("%41" -> 'A').
    //
    // Returns an empty string if an escape is malformed, i.e. the '%' is not
    // followed by exactly two hexadecimal digits. Note that an empty input
    // also yields an empty result, so callers that treat "empty" as failure
    // must handle empty input themselves.
    std::string decode_percents(std::string const& url_path)
    {
        std::string::size_type pos = 0, next;
        std::string result;
        result.reserve(url_path.length());

        while ((next = url_path.find('%', pos)) != std::string::npos)
        {
            // copy the text preceding the escape
            result.append(url_path, pos, next - pos);

            // an escape needs two characters after the '%'
            if (url_path.length() - next < 3)
                return "";

            // Both characters must be hex digits. Checking them explicitly
            // (rather than relying on strtol's end pointer) rejects inputs
            // such as "%+1" or "% 1", which strtol would happily parse.
            unsigned char const high =
                static_cast<unsigned char>(url_path[next + 1]);
            unsigned char const low =
                static_cast<unsigned char>(url_path[next + 2]);
            if (!std::isxdigit(high) || !std::isxdigit(low))
                return "";

            char const hex[3] = {url_path[next + 1], url_path[next + 2], '\0'};
            result += static_cast<char>(std::strtol(hex, nullptr, 16));
            pos = next + 3;
        }

        // copy whatever follows the last escape
        result.append(url_path, pos, std::string::npos);

        return result;
    }

    // Tells whether 'p' names a stylesheet, judged solely by its extension
    // being exactly ".css".
    bool is_css(const path& p)
    {
        auto const extension = p.extension();
        return extension == ".css";
    }

}    // unnamed namespace

namespace boost { namespace inspect {

    //  link_check constructor  --------------------------------------------------//

    link_check::link_check()
      : m_broken_errors(0)
      , m_unlinked_errors(0)
      , m_invalid_errors(0)
      , m_bookmark_errors(0)
      , m_duplicate_bookmark_errors(0)
    {
        // HTML signatures are already registered by the base class,
        // 'hypertext_inspector'
        register_signature(".css");
    }

    //  inspect (all)  -----------------------------------------------------------//

    // Called for every path: records that a non-directory exists so later
    // link resolution can consult the table instead of the disk.
    void link_check::inspect(
        const string& /*library_name*/, const path& full_path)
    {
        // directories are not tracked
        if (fs::is_directory(full_path))
            return;

        // flag the entry (creating it if needed) as present on disk
        auto const key = relative_to(full_path, search_root_path());
        m_paths[key] |= m_present;
    }

    //  inspect ( .htm, .html, .shtml, .css )  -----------------------------------//

    // Inspect the contents of one hypertext or stylesheet file.
    //
    // For non-CSS files this first collects all bookmarks (NAME/ID attribute
    // values), reporting duplicates, and then hands every HREF/SRC URL to
    // do_url(). Afterwards CSS style references ('@import', 'url(...)') are
    // handed to do_url() as well.
    //
    // library_name: passed through to error().
    // full_path:    path of the file being inspected.
    // contents:     the complete text of that file.
    void link_check::inspect(const string& library_name,
        const path& full_path,     // example: c:/foo/boost/filesystem/path.hpp
        const string& contents)    // contents of file to be inspected
    {
        // A file containing this marker is exempt from the "Unlinked file"
        // report issued by close().
        // NOTE(review): the marker literals are split in two, presumably so
        // that this source file does not itself contain them - confirm.
        if (contents.find("hpxinspect:"
                          "nounlinked") != string::npos)
            m_paths[relative_to(full_path, search_root_path())] |=
                m_nounlinked_errors;

        // A file containing this marker gets no link errors reported, but
        // its link targets are still recorded as linked-to.
        bool no_link_errors = (contents.find("hpxinspect:"
                                             "nolink") != string::npos);

        bool const css_file = is_css(full_path);

        // build bookmarks databases
        bookmarks.clear();
        bookmarks_lowercase.clear();
        string::const_iterator a_start(contents.begin());
        string::const_iterator a_end(contents.end());
        boost::match_results<string::const_iterator> a_what;
        boost::match_flag_type a_flags = boost::match_default;

        if (!css_file)
        {
            // value of the ID attribute seen immediately before, if any
            string previous_id;

            while (boost::regex_search(
                a_start, a_end, a_what, html_bookmark_regex, a_flags))
            {
                // a_what[0] contains the whole string iterators.
                // a_what[1] contains the tag iterators.
                // a_what[2] contains the attribute name.
                // a_what[3] contains the quote character.
                // a_what[4] contains the bookmark iterators.

                // group 4 does not participate when an HTML comment matched
                if (a_what[4].matched)
                {
                    string tag(a_what[1].first, a_what[1].second);
                    hpx::string_util::to_lower(tag);
                    string attribute(a_what[2].first, a_what[2].second);
                    hpx::string_util::to_lower(attribute);
                    string bookmark(a_what[4].first, a_what[4].second);

                    // A NAME repeating the value of the directly preceding
                    // ID is not counted as a second bookmark.
                    bool name_following_id =
                        (attribute == "name" && previous_id == bookmark);
                    if (tag != "meta" && attribute == "id")
                        previous_id = bookmark;
                    else
                        previous_id.clear();

                    if (tag != "meta" && !name_following_id)
                    {
                        bookmarks.insert(bookmark);

                        // w3.org recommends case-insensitive checking for duplicate bookmarks
                        // since some browsers do a case-insensitive match.
                        string bookmark_lowercase(bookmark);
                        hpx::string_util::to_lower(bookmark_lowercase);

                        std::pair<bookmark_set::iterator, bool> result =
                            bookmarks_lowercase.insert(bookmark_lowercase);
                        if (!result.second)
                        {
                            ++m_duplicate_bookmark_errors;
                            // line of the attribute's opening quote
                            std::size_t ln = std::count(contents.begin(),
                                                 a_what[3].first, '\n') +
                                1;
                            error(library_name, full_path,
                                "Duplicate bookmark: " + bookmark, ln);
                        }
                    }
                }

                a_start = a_what[0].second;            // update search position
                a_flags |= boost::match_prev_avail;    // update flags
                a_flags |= boost::match_not_bob;
            }
        }

        // process urls
        string::const_iterator start(contents.begin());
        string::const_iterator end(contents.end());
        boost::match_results<string::const_iterator> what;
        boost::match_flag_type flags = boost::match_default;

        if (!css_file)
        {
            while (boost::regex_search(start, end, what, html_url_regex, flags))
            {
                // what[0] contains the whole string iterators.
                // what[1] contains the element type iterators.
                // what[3] contains the URL iterators.

                // group 3 does not participate when an HTML comment matched
                if (what[3].matched)
                {
                    string type(what[1].first, what[1].second);
                    hpx::string_util::to_lower(type);

                    // Only <a> and <link> may refer to external content.
                    // TODO: Complain if 'link' tags use external stylesheets.
                    do_url(string(what[3].first, what[3].second), library_name,
                        full_path, no_link_errors,
                        type == "a" || type == "link", contents.begin(),
                        what[3].first);
                }

                start = what[0].second;              // update search position
                flags |= boost::match_prev_avail;    // update flags
                flags |= boost::match_not_bob;
            }
        }

        // NOTE(review): for non-CSS files 'start' and 'flags' carry over from
        // the loop above, so only the text after the last HTML match is
        // scanned for CSS references - confirm this is intended.
        while (boost::regex_search(start, end, what, css_url_regex, flags))
        {
            // what[0] contains the whole string iterators.
            // what[2] contains the URL iterators.

            // group 2 does not participate when a CSS comment matched
            if (what[2].matched)
            {
                // The URL position is what[2]; css_url_regex has no third
                // capture group, so what[3] must not be used here.
                do_url(string(what[2].first, what[2].second), library_name,
                    full_path, no_link_errors, false, contents.begin(),
                    what[2].first);
            }

            start = what[0].second;              // update search position
            flags |= boost::match_prev_avail;    // update flags
            flags |= boost::match_not_bob;
        }
    }

    //  do_url  ------------------------------------------------------------------//

    void link_check::do_url(const string& url, const string& library_name,
        const path& source_path, bool no_link_errors,
        bool allow_external_content, std::string::const_iterator contents_begin,
        std::string::const_iterator url_start)
    // precondition: source_path.is_complete()
    {
        if (!no_link_errors && url.empty())
        {
            ++m_invalid_errors;
            std::size_t ln = std::count(contents_begin, url_start, '\n') + 1;
            error(library_name, source_path, "Empty URL.", ln);
            return;
        }

        // Decode ampersand encoded characters.
        string decoded_url = is_css(source_path) ? url : decode_ampersands(url);
        if (decoded_url.empty())
        {
            if (!no_link_errors)
            {
                ++m_invalid_errors;
                std::size_t ln =
                    std::count(contents_begin, url_start, '\n') + 1;
                error(library_name, source_path,
                    "Invalid URL (invalid ampersand encodings): " + url, ln);
            }
            return;
        }

        boost::smatch m;
        if (!boost::regex_match(decoded_url, m, url_decompose_regex))
        {
            if (!no_link_errors)
            {
                ++m_invalid_errors;
                std::size_t ln =
                    std::count(contents_begin, url_start, '\n') + 1;
                error(library_name, source_path, "Invalid URL: " + decoded_url,
                    ln);
            }
            return;
        }

        bool scheme_matched = m[2].matched, authority_matched = m[4].matched,
             //query_matched = m[7].matched,
            fragment_matched = m[9].matched;

        std::string scheme(m[2]), authority(m[4]), url_path(m[5]),
            //query(m[7]),
            fragment(m[9]);

        // Check for external content
        if (!allow_external_content && (authority_matched || scheme_matched))
        {
            if (!no_link_errors)
            {
                ++m_invalid_errors;
                std::size_t ln =
                    std::count(contents_begin, url_start, '\n') + 1;
                error(library_name, source_path,
                    "External content: " + decoded_url, ln);
            }
        }

        // Protocol checks
        if (scheme_matched)
        {
            if (scheme == "http" || scheme == "https")
            {
                // All http links should have a hostname. Generally if they don't
                // it's by mistake. If they shouldn't, then a protocol isn't
                // required.
                if (!authority_matched)
                {
                    if (!no_link_errors)
                    {
                        ++m_invalid_errors;
                        std::size_t ln =
                            std::count(contents_begin, url_start, '\n') + 1;
                        error(library_name, source_path,
                            "No hostname: " + decoded_url, ln);
                    }
                }

                return;
            }
            else if (scheme == "file")
            {
                if (!no_link_errors)
                {
                    ++m_invalid_errors;
                    std::size_t ln =
                        std::count(contents_begin, url_start, '\n') + 1;
                    error(library_name, source_path,
                        "Invalid URL (hardwired file): " + decoded_url, ln);
                }
            }
            else if (scheme == "mailto" || scheme == "ftp" ||
                scheme == "news" || scheme == "javascript")
            {
                if (!no_link_errors && is_css(source_path))
                {
                    ++m_invalid_errors;
                    std::size_t ln =
                        std::count(contents_begin, url_start, '\n') + 1;
                    error(library_name, source_path,
                        "Invalid protocol for css: " + decoded_url, ln);
                }
            }
            else
            {
                if (!no_link_errors)
                {
                    ++m_invalid_errors;
                    std::size_t ln =
                        std::count(contents_begin, url_start, '\n') + 1;
                    error(library_name, source_path,
                        "Unknown protocol: '" + scheme +
                            "' in url: " + decoded_url,
                        ln);
                }
            }

            return;
        }

        // Hostname without protocol.
        if (authority_matched)
        {
            if (!no_link_errors)
            {
                ++m_invalid_errors;
                std::size_t ln =
                    std::count(contents_begin, url_start, '\n') + 1;
                error(library_name, source_path,
                    "Invalid URL (hostname without protocol): " + decoded_url,
                    ln);
            }
        }

        // Check the fragment identifier
        if (fragment_matched)
        {
            if (is_css(source_path))
            {
                if (!no_link_errors)
                {
                    ++m_invalid_errors;
                    std::size_t ln =
                        std::count(contents_begin, url_start, '\n') + 1;
                    error(library_name, source_path,
                        "Fragment link in CSS: " + decoded_url, ln);
                }
            }
            else
            {
                if (!no_link_errors && fragment.find('#') != string::npos)
                {
                    ++m_bookmark_errors;
                    std::size_t ln =
                        std::count(contents_begin, url_start, '\n') + 1;
                    error(library_name, source_path,
                        "Invalid bookmark: " + decoded_url, ln);
                }
                else if (!no_link_errors && url_path.empty() &&
                    !fragment.empty()
                    // w3.org recommends case-sensitive broken bookmark checking
                    // since some browsers do a case-sensitive match.
                    && bookmarks.find(decode_percents(fragment)) ==
                        bookmarks.end())
                {
                    ++m_broken_errors;
                    std::size_t ln =
                        std::count(contents_begin, url_start, '\n') + 1;
                    error(library_name, source_path,
                        "Unknown bookmark: " + decoded_url, ln);
                }
            }

            // No more to do if it's just a fragment identifier
            if (url_path.empty())
                return;
        }

        // Detect characters banned by RFC2396:
        if (!no_link_errors &&
            decoded_url.find_first_of(" <>\"{}|\\^[]'") != string::npos)
        {
            ++m_invalid_errors;
            std::size_t ln = std::count(contents_begin, url_start, '\n') + 1;
            error(library_name, source_path,
                "Invalid character in URL: " + decoded_url, ln);
        }

        // Check that we actually have a path.
        if (url_path.empty())
        {
            if (!no_link_errors)
            {
                ++m_invalid_errors;
                std::size_t ln =
                    std::count(contents_begin, url_start, '\n') + 1;
                error(library_name, source_path,
                    "Invalid URL (empty path in relative url): " + decoded_url,
                    ln);
            }
        }

        // Decode percent encoded characters.
        string decoded_path = decode_percents(url_path);
        if (decoded_path.empty())
        {
            if (!no_link_errors)
            {
                ++m_invalid_errors;
                std::size_t ln =
                    std::count(contents_begin, url_start, '\n') + 1;
                error(library_name, source_path,
                    "Invalid URL (invalid character encodings): " + decoded_url,
                    ln);
            }
            return;
        }

        // strip url of references to current dir
        if (decoded_path[0] == '.' && decoded_path[1] == '/')
            decoded_path.erase(0, 2);

        // url is relative source_path.branch()
        // convert to target_path, which is_complete()
        path target_path;
        try
        {
            target_path = source_path.parent_path() /= path(decoded_path);
        }
        catch (fs::filesystem_error const&)
        {
            if (!no_link_errors)
            {
                std::size_t ln =
                    std::count(contents_begin, url_start, '\n') + 1;
                ++m_invalid_errors;
                error(library_name, source_path,
                    "Invalid URL (error resolving path): " + decoded_url, ln);
            }
            return;
        }

        // create a m_paths entry if necessary
        std::pair<const string, int> entry(
            relative_to(target_path, search_root_path()), 0);
        m_path_map::iterator itr(m_paths.find(entry.first));
        if (itr == m_paths.end())
        {
            if (fs::exists(target_path))
                entry.second = m_present;
            itr = m_paths.insert(entry).first;
        }

        // itr now points to the m_paths entry
        itr->second |= m_linked_to;

        // if target isn't present, the link is broken
        if (!no_link_errors && (itr->second & m_present) == 0)
        {
            ++m_broken_errors;
            std::size_t ln = std::count(contents_begin, url_start, '\n') + 1;
            error(library_name, source_path, "Broken link: " + decoded_url, ln);
        }
    }

    //  close  -------------------------------------------------------------------//

    void link_check::close()
    {
        for (m_path_map::const_iterator itr = m_paths.begin();
             itr != m_paths.end(); ++itr)
        {
            // std::clog << itr->first << " " << itr->second << "\n";
            if ((itr->second & m_linked_to) != m_linked_to &&
                (itr->second & m_nounlinked_errors) != m_nounlinked_errors &&
                (itr->first.rfind(".html") == itr->first.size() - 5 ||
                    itr->first.rfind(".htm") == itr->first.size() - 4 ||
                    itr->first.rfind(".css") == itr->first.size() - 4)
                // because they may be redirectors, it is OK if these are unlinked:
                && itr->first.rfind("index.html") == string::npos &&
                itr->first.rfind("index.htm") == string::npos)
            {
                ++m_unlinked_errors;
                path full_path(search_root_path() / path(itr->first));
                error(impute_library(full_path), full_path,
                    loclink(full_path, "Unlinked file"));
            }
        }
    }

}}    // namespace boost::inspect
